Machine learning exploration


File descriptions

train.csv - the training set
test.csv - the test set
sample_submission.csv - a sample submission file in the correct format




Data fields

id - id of the creature
bone_length - average length of bone in the creature, normalized between 0 and 1
rotting_flesh - percentage of rotting flesh in the creature
hair_length - average hair length, normalized between 0 and 1
has_soul - percentage of soul in the creature
color - dominant color of the creature: ‘white’,‘black’,‘clear’,‘blue’,‘green’,‘blood’
type - target variable: ‘Ghost’, ‘Goblin’, and ‘Ghoul’




About Kaggle

In 2010, Kaggle was founded as a platform for predictive modelling and analytics competitions on which companies and researchers post their data and statisticians and data miners from all over the world compete to produce the best models.

This crowdsourcing approach relies on the fact that there are countless strategies that can be applied to any predictive modelling task and it is impossible to know at the outset which technique or analyst will be most effective. Kaggle also hosts recruiting competitions in which data scientists compete for a chance to interview at leading data science companies like Facebook, Winton Capital, and Walmart.




Data exploration

The data quality report

Summary

# Quick data-quality check: five-number summary (plus mean) for every column
summary(train)
##        id         bone_length      rotting_flesh      hair_length    
##  Min.   :  0.0   Min.   :0.06103   Min.   :0.09569   Min.   :0.1346  
##  1st Qu.:205.5   1st Qu.:0.34001   1st Qu.:0.41481   1st Qu.:0.4074  
##  Median :458.0   Median :0.43489   Median :0.50155   Median :0.5386  
##  Mean   :443.7   Mean   :0.43416   Mean   :0.50685   Mean   :0.5291  
##  3rd Qu.:678.5   3rd Qu.:0.51722   3rd Qu.:0.60398   3rd Qu.:0.6472  
##  Max.   :897.0   Max.   :0.81700   Max.   :0.93247   Max.   :1.0000  
##     has_soul           color               type          
##  Min.   :0.009402   Length:371         Length:371        
##  1st Qu.:0.348002   Class :character   Class :character  
##  Median :0.466372   Mode  :character   Mode  :character  
##  Mean   :0.471392                                        
##  3rd Qu.:0.600610                                        
##  Max.   :0.935721                                        
##    Dataset         
##  Length:371        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 



Str

# Columns that are categorical identifiers/labels rather than numeric features
factor_V <- c('id', 'color', 'type')
# Coerce each of them to a factor in place
train[factor_V] <- lapply(train[factor_V], as.factor)

# Confirm the conversion: id/color/type are factors, the four features doubles
str(train)
## 'data.frame':    371 obs. of  8 variables:
##  $ id           : Factor w/ 371 levels "0","1","2","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ bone_length  : num  0.355 0.576 0.468 0.777 0.566 ...
##  $ rotting_flesh: num  0.351 0.426 0.354 0.509 0.876 ...
##  $ hair_length  : num  0.466 0.531 0.812 0.637 0.419 ...
##  $ has_soul     : num  0.781 0.44 0.791 0.884 0.636 ...
##  $ color        : Factor w/ 6 levels "black","blood",..: 4 5 1 1 5 5 6 4 3 6 ...
##  $ type         : Factor w/ 3 levels "Ghost","Ghoul",..: 2 3 2 2 1 3 3 2 1 1 ...
##  $ Dataset      : chr  "train" "train" "train" "train" ...



First visualisations

Correlation

library(corrplot)
# Pairwise Pearson correlations between the four numeric predictors
train_correlation <- cor(train %>% select(bone_length:has_soul))
# corrplot(train_correlation, method="circle")

# Mixed display: coefficients in one triangle, circles in the other
corrplot.mixed(train_correlation)

#cor(train_correlation)

Comparison

# Scatterplot matrix of the four numeric features (columns 2-5),
# points coloured by creature type
pairs(train[,2:5], 
      col = train$type, 
      labels = c("Bone Length", "Rotting Flesh", "Hair Length", "Soul"))

Histogram

# Distributions of the three numeric features, side by side
par(mfrow = c(1, 3))
for (feature in c("bone_length", "rotting_flesh", "has_soul")) {
  hist(train[[feature]], col = "#3090C7",
       main = feature, xlab = paste0("train$", feature))
}

# Bar plots of the two factor columns (plot() on a factor draws counts)
par(mfrow = c(1, 2))
plot(train$color, col = "#3090C7", main = "Color")
plot(train$type, col = "#3090C7", main = "Type")

3D Plot

# Plot using plotly.
# NOTE(review): library(plotly) was never loaded in the visible source —
# presumably attached in a hidden setup chunk; made explicit here.  The
# formula interface (~col) is preferred over train$col when a `data`
# argument is supplied, so the traces stay linked to the data frame.
library(plotly)
p <- plot_ly(train,
             x = ~bone_length, y = ~rotting_flesh, z = ~has_soul,
             type = "scatter3d", mode = "markers",
             color = ~type)
p

Color histogram

# Counts per dominant colour, stacked by creature type
# (here `color` is the data column on the x-axis, not the colour aesthetic)
ggplot(train, aes(color, fill = type)) + geom_bar()

Our features don’t look easy to distinguish… let’s try to create better features.




Feature engineering

By multiplying our variables together we should obtain better features to distinguish the classes.

1

# Sep1: product of three features, then rescaled so its maximum is 1
full <- full %>%
    mutate(sep1 = bone_length * hair_length * has_soul) %>%
    mutate(sep1 = sep1 / max(sep1))

# How well does the new feature separate the three classes?
ggplot(full, aes(x = id, y = sep1, color = type)) +
    geom_point()




2

# Sep2: sep1 scaled down by rotting_flesh, then renormalised to max 1.
# NOTE(review): dividing by rotting_flesh can blow up for near-zero values;
# the warning below suggests one row was dropped from the plot — verify.
full <- full %>%
    mutate(sep2 = sep1 / (rotting_flesh),
          sep2 = sep2 / max(sep2))
ggplot(full, aes(id, sep2, color = type)) +
    geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).




3

# allfeatures: weighted product of all four features (hair and soul
# dominate via the 4th power), divided by rotting_flesh
full <- full %>%
    mutate(allfeatures = ((bone_length^2) * (hair_length^4) * (has_soul^4))/rotting_flesh)
# Plot the feature that was just created — the original plotted sep1
# again, so the figure never actually showed `allfeatures`.
ggplot(full, aes(id, allfeatures, color = type)) +
    geom_point()




4

# All six pairwise products of the four numeric features, built in one pass
full <- full %>%
          mutate(hair_soul = hair_length * has_soul,
                 bone_flesh = bone_length * rotting_flesh,
                 bone_hair = bone_length * hair_length,
                 bone_soul = bone_length * has_soul,
                 flesh_hair = rotting_flesh * hair_length,
                 flesh_soul = rotting_flesh * has_soul)

Modeling

Simple predictive modeling

Cross-Validation

# Cross-validation dataset
train_cv <- train

# Build the 3 levels
#Customer_cv$Long_term_value<-cut(Customer_cv$sum, c(0,100, 400, 40000))
#levels(Customer_cv$Long_term_value) <- c('low_value', 'medium_value', 'high_value')

# Set the target variable as a factor
#Customer_cv$Long_term_value <- as.factor(Customer_cv$Long_term_value)
#Customer_cv <- Customer_cv %>% select(age:Long_term_value)

# 8-fold cross-validation.  NOTE: `repeats` is only meaningful for
# method = "repeatedcv"; with method = "cv" caret ignores it (newer
# versions warn), so the misleading argument is dropped here.
# library(caret)
train_control<- trainControl(method="cv", number=8)
head(train_control)
## $method
## [1] "cv"
## 
## $number
## [1] 8
## 
## $repeats
## [1] 5
## 
## $search
## [1] "grid"
## 
## $p
## [1] 0.75
## 
## $initialWindow
## NULL



Tree learning

library("rpart.plot")
## Loading required package: rpart
# Classification tree on the five raw features, grown with the
# information-gain splitting criterion and at least 50 obs per split
fit <- rpart(
  type ~ bone_length + rotting_flesh + hair_length + has_soul + color,
  data    = train_cv,
  method  = "class",
  parms   = list(split = 'information'),
  control = rpart.control(minsplit = 50)
)

# Draw the tree: split labels below nodes (type = 2), class counts (extra = 1)
rpart.plot(fit, type = 2, extra = 1)

library("rpart")
library("rpart.plot")

# Baseline tree on the raw features (kept for reference only — the PCA
# model below is the one used for the predictions that follow).  The
# original code assigned both fits to `rpartmodel`, silently discarding
# this first one.
rpartmodel_raw <- train(type~bone_length + rotting_flesh + hair_length + has_soul + color, data=train_cv, trControl=train_control, method="rpart", parms = list(split='information'), control = rpart.control(minsplit = 1))

# Tree trained on PCA-rotated predictors (color omitted: PCA needs
# numeric inputs)
rpartmodel<- train(type~bone_length + rotting_flesh + hair_length + has_soul, data=train_cv, trControl=train_control, method="rpart", control = rpart.control(minsplit = 1), preProcess = "pca", parms = list(split='information'))

# make predictions on the training data (resubstitution estimate)
predictions <- predict(rpartmodel,train_cv)
train_cv_tree<- cbind(train_cv,predictions)

# summarize results — stored under a distinct name so the caret function
# confusionMatrix() is not shadowed by the result object
cm_tree <- confusionMatrix(train_cv_tree$predictions,train_cv_tree$type)
cm_tree
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Ghost Ghoul Goblin
##     Ghost     92     0      6
##     Ghoul      0    79     16
##     Goblin    25    50    103
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7385          
##                  95% CI : (0.6907, 0.7825)
##     No Information Rate : 0.3477          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6074          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Ghost Class: Ghoul Class: Goblin
## Sensitivity                0.7863       0.6124        0.8240
## Specificity                0.9764       0.9339        0.6951
## Pos Pred Value             0.9388       0.8316        0.5787
## Neg Pred Value             0.9084       0.8188        0.8860
## Prevalence                 0.3154       0.3477        0.3369
## Detection Rate             0.2480       0.2129        0.2776
## Detection Prevalence       0.2642       0.2561        0.4798
## Balanced Accuracy          0.8814       0.7731        0.7596



Naive Bayes

library(e1071)
library(rminer)
# train the model — caret's method = "nb" dispatches to klaR's NaiveBayes,
# hence the package message below
e1071model <- train(type~bone_length + rotting_flesh + hair_length + has_soul + color, data=train_cv, trControl=train_control, method="nb")
## Loading required package: klaR
# make predictions (resubstitution, same data as training)
predictions <- predict(e1071model,train_cv)
e1071modelbinded <- cbind(train_cv,predictions)
# summarize results — distinct name avoids shadowing confusionMatrix()
cm_nb <- confusionMatrix(e1071modelbinded$predictions,e1071modelbinded$type)
cm_nb
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Ghost Ghoul Goblin
##     Ghost     91     0      7
##     Ghoul      0   100     24
##     Goblin    26    29     94
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7682          
##                  95% CI : (0.7219, 0.8102)
##     No Information Rate : 0.3477          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6515          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Ghost Class: Ghoul Class: Goblin
## Sensitivity                0.7778       0.7752        0.7520
## Specificity                0.9724       0.9008        0.7764
## Pos Pred Value             0.9286       0.8065        0.6309
## Neg Pred Value             0.9048       0.8826        0.8604
## Prevalence                 0.3154       0.3477        0.3369
## Detection Rate             0.2453       0.2695        0.2534
## Detection Prevalence       0.2642       0.3342        0.4016
## Balanced Accuracy          0.8751       0.8380        0.7642



KNN

library(class)
# train the model — kNN with centred/scaled predictors, tuning k over 10 values
knnFit <- train(type ~ bone_length + rotting_flesh + hair_length + has_soul + color, data = train_cv, method = "knn", trControl = train_control, preProcess = c("center","scale"), tuneLength = 10)
# make predictions (resubstitution, same data as training)
predictions<- predict(knnFit,train_cv)
knnFit_bind <- cbind(train_cv,predictions)
# summarize results — distinct name avoids shadowing confusionMatrix()
cm_knn <- confusionMatrix(knnFit_bind$predictions,knnFit_bind$type)
cm_knn
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Ghost Ghoul Goblin
##     Ghost    102     1     14
##     Ghoul      2   100     23
##     Goblin    13    28     88
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7817          
##                  95% CI : (0.7361, 0.8227)
##     No Information Rate : 0.3477          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6723          
##  Mcnemar's Test P-Value : 0.8349          
## 
## Statistics by Class:
## 
##                      Class: Ghost Class: Ghoul Class: Goblin
## Sensitivity                0.8718       0.7752        0.7040
## Specificity                0.9409       0.8967        0.8333
## Pos Pred Value             0.8718       0.8000        0.6822
## Neg Pred Value             0.9409       0.8821        0.8471
## Prevalence                 0.3154       0.3477        0.3369
## Detection Rate             0.2749       0.2695        0.2372
## Detection Prevalence       0.3154       0.3369        0.3477
## Balanced Accuracy          0.9064       0.8359        0.7687

Comparing models

# load the library
library(mlbench)
# load the dataset: keep the four numeric features, color and the target
comp.train <- train %>% select(bone_length:type)
#data(PimaIndiansDiabetes)
# prepare training scheme: 10-fold CV repeated 3 times
comp.control <- trainControl(method="repeatedcv", number=10, repeats=3)
# NOTE: calling set.seed(7) before every train() gives each model the same
# resampling folds, so the resamples() comparison below is paired.
# train the LVQ model (Learning Vector Quantization)
set.seed(7)
modelLvq <- train(type~., data=comp.train, method="lvq", trControl=comp.control)
# train the SVM model
set.seed(7)
modelSvm <- train(type~., data=comp.train, method="svmRadial", trControl=comp.control)
# train tree
set.seed(7)
modeltree <- train(type~., data=comp.train, method="rpart", trControl=comp.control)
# Tree + PCA
set.seed(7)
modeltreepca <- train(type~., data=comp.train, method="rpart", trControl=comp.control, preProcess = "pca", parms = list(split='information'))
# KNN
set.seed(7)
modelknn <- train(type~., data=comp.train, method="knn", trControl=comp.control)
# Bayes
set.seed(7)
modelbayes <- train(type~., data=comp.train, method="nb", trControl=comp.control)
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 33
# collect resamples across all six models for a fold-by-fold comparison
results <- resamples(list(LVQ=modelLvq, SVM=modelSvm, TREE=modeltree, TREEPCA=modeltreepca, KNN=modelknn, NBayes=modelbayes))
# summarize the distributions
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: LVQ, SVM, TREE, TREEPCA, KNN, NBayes 
## Number of resamples: 30 
## 
## Accuracy 
##           Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LVQ     0.3421  0.5329 0.5946 0.5843  0.6486 0.7368    0
## SVM     0.5135  0.6778 0.7297 0.7171  0.7568 0.8889    0
## TREE    0.5405  0.5833 0.6216 0.6286  0.6757 0.7632    0
## TREEPCA 0.5135  0.6038 0.6623 0.6567  0.6842 0.7838    0
## KNN     0.5833  0.6623 0.6842 0.6934  0.7133 0.8649    0
## NBayes  0.5946  0.6757 0.7027 0.7121  0.7566 0.8378    0
## 
## Kappa 
##            Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LVQ     0.01247  0.2977 0.3921 0.3749  0.4717 0.6062    0
## SVM     0.26650  0.5172 0.5941 0.5747  0.6336 0.8333    0
## TREE    0.30880  0.3766 0.4342 0.4441  0.5145 0.6463    0
## TREEPCA 0.26570  0.4087 0.4954 0.4856  0.5284 0.6754    0
## KNN     0.36920  0.4925 0.5265 0.5392  0.5698 0.7976    0
## NBayes  0.39080  0.5140 0.5523 0.5670  0.6340 0.7571    0



Combining model with ensemble methods

Bagging

We use multiple models (of the same kind) to aggregate and predict:

. Bagged CART
. Random Forest

# Example of Bagging algorithms.  Both models share the same repeated-CV
# scheme and seed, so their resamples are directly comparable.
control <- trainControl(method="repeatedcv", number=10, repeats=3)
seed <- 7
metric <- "Accuracy"

# Bagged CART
set.seed(seed)
fit.treebag <- train(type~., data=comp.train, method="treebag", metric=metric, trControl=control)

# Random Forest
set.seed(seed)
fit.rf <- train(type~., data=comp.train, method="rf", metric=metric, trControl=control)

# summarize results
bagging_results <- resamples(list(treebag=fit.treebag, rf=fit.rf))
summary(bagging_results)
## 
## Call:
## summary.resamples(object = bagging_results)
## 
## Models: treebag, rf 
## Number of resamples: 30 
## 
## Accuracy 
##           Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## treebag 0.5263  0.6757 0.7027 0.7106  0.7568 0.8158    0
## rf      0.6216  0.6689 0.7297 0.7207  0.7568 0.8378    0
## 
## Kappa 
##           Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## treebag 0.2890  0.5132 0.5523 0.5658  0.6342 0.7241    0
## rf      0.4283  0.5007 0.5941 0.5806  0.6349 0.7568    0
# Visual comparison of the two accuracy/kappa distributions
dotplot(bagging_results)




Boosting

Boosting is like bagging, but this time we focus on the mistakes made by the preceding models.

. C5.0
. Stochastic Gradient Boosting

library(mlbench)
library(caret)
library(caretEnsemble)

# Example of Boosting Algorithms — same resampling scheme and seed as the
# bagging section so results can be compared later.
control <- trainControl(method="repeatedcv", number=10, repeats=3)
seed <- 7
metric <- "Accuracy"
# C5.0
set.seed(seed)
# NOTE(review): the error below about 'partykit' was captured in the render
# yet the C5.0 results still appear — presumably the package was installed
# and the chunk re-run; confirm when re-knitting.
## Error in loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]) : there is no package called ‘partykit’
fit.c50 <- train(type~., data=comp.train, method="C5.0", metric=metric, trControl=control)
# Stochastic Gradient Boosting
set.seed(seed)
fit.gbm <- train(type~., data=comp.train, method="gbm", metric=metric, trControl=control, verbose=FALSE)
# summarize results
boosting_results <- resamples(list(c5.0=fit.c50, gbm=fit.gbm))
summary(boosting_results)
## 
## Call:
## summary.resamples(object = boosting_results)
## 
## Models: c5.0, gbm 
## Number of resamples: 30 
## 
## Accuracy 
##        Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## c5.0 0.5789  0.6601 0.7162 0.7122  0.7568 0.8378    0
## gbm  0.6486  0.7047 0.7434 0.7393  0.7568 0.8611    0
## 
## Kappa 
##        Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## c5.0 0.3692  0.4911 0.5741 0.5681  0.6344 0.7566    0
## gbm  0.4703  0.5570 0.6134 0.6085  0.6356 0.7919    0
# Visual comparison of the two boosting models
dotplot(boosting_results)




Stacking

We use models of different types to aggregate and predict.

Example with: Linear Discriminant Analysis (LDA), Classification and Regression Trees (CART), k-Nearest Neighbors (kNN), and a Support Vector Machine with a Radial Basis Kernel Function (SVM).

# Example of Stacking algorithms
# create submodels — savePredictions/classProbs are required so a meta-model
# could later be stacked on top of the out-of-fold predictions
control <- trainControl(method="repeatedcv", number=10, repeats=3, savePredictions=TRUE, classProbs=TRUE)
algorithmList <- c('lda', 'rpart', 'knn', 'svmRadial')

set.seed(seed)
# Fit all four base learners on identical folds
models <- caretList(type~., data=comp.train, trControl=control, methodList=algorithmList)
results <- resamples(models)
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: lda, rpart, knn, svmRadial 
## Number of resamples: 30 
## 
## Accuracy 
##             Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## lda       0.5676  0.7027 0.7297 0.7303  0.7616 0.8611    0
## rpart     0.5405  0.5833 0.6216 0.6286  0.6757 0.7632    0
## knn       0.5946  0.6554 0.6842 0.6934  0.7193 0.8649    0
## svmRadial 0.5135  0.6842 0.7183 0.7170  0.7568 0.8889    0
## 
## Kappa 
##             Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## lda       0.3495  0.5518 0.5936 0.5948  0.6419 0.7919    0
## rpart     0.3088  0.3766 0.4342 0.4441  0.5145 0.6463    0
## knn       0.3888  0.4808 0.5260 0.5392  0.5801 0.7976    0
## svmRadial 0.2665  0.5265 0.5777 0.5745  0.6346 0.8333    0
dotplot(results)

# correlation between results — low correlation between base learners is
# what makes stacking worthwhile (here lda and svmRadial are highly correlated)
modelCor(results)
##                 lda     rpart       knn svmRadial
## lda       1.0000000 0.3515136 0.3736171 0.7924729
## rpart     0.3515136 1.0000000 0.1744603 0.2765570
## knn       0.3736171 0.1744603 1.0000000 0.5322076
## svmRadial 0.7924729 0.2765570 0.5322076 1.0000000
# Scatterplot matrix of the per-fold accuracies
splom(results)

Let’s combine the predictions of the classifiers using a simple linear model.

# Thanks: http://machinelearningmastery.com/machine-learning-ensembles-with-r/ but not yet implemented for multiclass problems...
# stack using glm
# stackControl <- trainControl(method="repeatedcv", number=10, repeats=3, savePredictions=TRUE, classProbs=TRUE)
# set.seed(seed)
# stack.glm <- caretStack(models, method="glm", metric="Accuracy", trControl=stackControl)
# print(stack.glm)



Glmnet

# from https://www.kaggle.com/amberthomas/ghouls-goblins-and-ghosts-boo/ghosts-goblins-and-ghouls-oh-my 
set.seed(10)

### Clusters Without categorical variables
# Set the seed
set.seed(100)

# Extract creature labels and remove column from dataset
# (test rows carry NA labels; they stay in the clustering input, unlabelled)
creature_labels <- full$type
full2 <- full
full2$type <- NULL

# Remove categorical variables (id, color, and dataset) from dataset
full2$id <- NULL
full2$color <- NULL
full2$Dataset <- NULL
# Keep only the raw numeric features plus the pairwise-product features
full2 <- full2 %>% select(bone_length:has_soul,hair_soul:flesh_soul)
# Perform k-means clustering with 3 clusters, repeat 30 times
creature_km_1 <- kmeans(full2, 3, nstart = 30)

# Split the combined frame back into labelled train and unlabelled test parts
train_complete <- full[full$Dataset == 'train', ]
test_complete <- full[full$Dataset == 'test', ]

# 10-fold cross-validation.  NOTE: `repeats` only applies to
# method = "repeatedcv"; with method = "cv" caret ignores it, so the
# misleading argument is dropped (the log below confirms a single
# 10-fold CV was actually run).
myControl <- trainControl(
      method = "cv", 
      number = 10,
      verboseIter = TRUE
      )


# Elastic-net multinomial model over raw + engineered features, tuning
# alpha (0 = ridge, 1 = lasso) crossed with 20 lambda values
glm_model <- train(
    type ~ bone_length + rotting_flesh + hair_length + has_soul + color + hair_soul + bone_flesh + bone_hair + 
        bone_soul + flesh_hair + flesh_soul, 
    method = "glmnet",
    tuneGrid = expand.grid(alpha = 0:1,
      lambda = seq(0.0001, 1, length = 20)),
    data = train_complete,
    trControl = myControl
)
## + Fold01: alpha=0, lambda=1 
## - Fold01: alpha=0, lambda=1 
## + Fold01: alpha=1, lambda=1 
## - Fold01: alpha=1, lambda=1 
## + Fold02: alpha=0, lambda=1 
## - Fold02: alpha=0, lambda=1 
## + Fold02: alpha=1, lambda=1 
## - Fold02: alpha=1, lambda=1 
## + Fold03: alpha=0, lambda=1 
## - Fold03: alpha=0, lambda=1 
## + Fold03: alpha=1, lambda=1 
## - Fold03: alpha=1, lambda=1 
## + Fold04: alpha=0, lambda=1 
## - Fold04: alpha=0, lambda=1 
## + Fold04: alpha=1, lambda=1 
## - Fold04: alpha=1, lambda=1 
## + Fold05: alpha=0, lambda=1 
## - Fold05: alpha=0, lambda=1 
## + Fold05: alpha=1, lambda=1 
## - Fold05: alpha=1, lambda=1 
## + Fold06: alpha=0, lambda=1 
## - Fold06: alpha=0, lambda=1 
## + Fold06: alpha=1, lambda=1 
## - Fold06: alpha=1, lambda=1 
## + Fold07: alpha=0, lambda=1 
## - Fold07: alpha=0, lambda=1 
## + Fold07: alpha=1, lambda=1 
## - Fold07: alpha=1, lambda=1 
## + Fold08: alpha=0, lambda=1 
## - Fold08: alpha=0, lambda=1 
## + Fold08: alpha=1, lambda=1 
## - Fold08: alpha=1, lambda=1 
## + Fold09: alpha=0, lambda=1 
## - Fold09: alpha=0, lambda=1 
## + Fold09: alpha=1, lambda=1 
## - Fold09: alpha=1, lambda=1 
## + Fold10: alpha=0, lambda=1 
## - Fold10: alpha=0, lambda=1 
## + Fold10: alpha=1, lambda=1 
## - Fold10: alpha=1, lambda=1 
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0, lambda = 0.0527 on full training set

Plot

library(fpc)
# Project the k-means clusters onto discriminant coordinates
plotcluster(full2, creature_km_1$cluster)

# Cross-tabulate cluster assignment against the true labels
# (rows with NA labels — the test set — are dropped by table())
table(creature_km_1$cluster, creature_labels)
##    creature_labels
##     Ghost Ghoul Goblin
##   1     7    39     75
##   2     4    86     24
##   3   106     4     26

Or:

# Same glmnet specification as glm_model above, refit so the full tuning
# table can be printed below
model <- train(
   type ~ bone_length + rotting_flesh + hair_length + has_soul + color + hair_soul + bone_flesh + bone_hair + 
        bone_soul + flesh_hair + flesh_soul, 
       data = train_complete,

   tuneGrid = expand.grid(alpha = 0:1,
                          lambda = seq(0.0001, 1, length = 20)),
   method = "glmnet",
   trControl = myControl
)
## + Fold01: alpha=0, lambda=1 
## - Fold01: alpha=0, lambda=1 
## + Fold01: alpha=1, lambda=1 
## - Fold01: alpha=1, lambda=1 
## + Fold02: alpha=0, lambda=1 
## - Fold02: alpha=0, lambda=1 
## + Fold02: alpha=1, lambda=1 
## - Fold02: alpha=1, lambda=1 
## + Fold03: alpha=0, lambda=1 
## - Fold03: alpha=0, lambda=1 
## + Fold03: alpha=1, lambda=1 
## - Fold03: alpha=1, lambda=1 
## + Fold04: alpha=0, lambda=1 
## - Fold04: alpha=0, lambda=1 
## + Fold04: alpha=1, lambda=1 
## - Fold04: alpha=1, lambda=1 
## + Fold05: alpha=0, lambda=1 
## - Fold05: alpha=0, lambda=1 
## + Fold05: alpha=1, lambda=1 
## - Fold05: alpha=1, lambda=1 
## + Fold06: alpha=0, lambda=1 
## - Fold06: alpha=0, lambda=1 
## + Fold06: alpha=1, lambda=1 
## - Fold06: alpha=1, lambda=1 
## + Fold07: alpha=0, lambda=1 
## - Fold07: alpha=0, lambda=1 
## + Fold07: alpha=1, lambda=1 
## - Fold07: alpha=1, lambda=1 
## + Fold08: alpha=0, lambda=1 
## - Fold08: alpha=0, lambda=1 
## + Fold08: alpha=1, lambda=1 
## - Fold08: alpha=1, lambda=1 
## + Fold09: alpha=0, lambda=1 
## - Fold09: alpha=0, lambda=1 
## + Fold09: alpha=1, lambda=1 
## - Fold09: alpha=1, lambda=1 
## + Fold10: alpha=0, lambda=1 
## - Fold10: alpha=0, lambda=1 
## + Fold10: alpha=1, lambda=1 
## - Fold10: alpha=1, lambda=1 
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0, lambda = 0.158 on full training set
# # Print model to console
model
## glmnet 
## 
## 371 samples
##  11 predictor
##   3 classes: 'Ghost', 'Ghoul', 'Goblin' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 333, 333, 335, 334, 334, 335, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda      Accuracy   Kappa    
##   0      0.00010000  0.7350640  0.6023770
##   0      0.05272632  0.7404734  0.6104269
##   0      0.10535263  0.7489529  0.6232589
##   0      0.15797895  0.7517307  0.6273562
##   0      0.21060526  0.7489529  0.6229377
##   0      0.26323158  0.7488818  0.6228710
##   0      0.31585789  0.7408408  0.6109715
##   0      0.36848421  0.7355066  0.6029439
##   0      0.42111053  0.7381381  0.6069389
##   0      0.47373684  0.7381381  0.6069389
##   0      0.52636316  0.7382843  0.6072288
##   0      0.57898947  0.7382843  0.6072288
##   0      0.63161579  0.7410621  0.6113954
##   0      0.68424211  0.7410621  0.6114630
##   0      0.73686842  0.7410621  0.6114630
##   0      0.78949474  0.7410621  0.6115241
##   0      0.84212105  0.7410621  0.6115241
##   0      0.89474737  0.7410621  0.6115241
##   0      0.94737368  0.7410621  0.6115241
##   0      1.00000000  0.7410621  0.6115241
##   1      0.00010000  0.7300221  0.5945355
##   1      0.05272632  0.7494705  0.6242450
##   1      0.10535263  0.7273985  0.5914999
##   1      0.15797895  0.6867631  0.5310673
##   1      0.21060526  0.6513988  0.4776070
##   1      0.26323158  0.6494942  0.4723063
##   1      0.31585789  0.3478031  0.0000000
##   1      0.36848421  0.3478031  0.0000000
##   1      0.42111053  0.3478031  0.0000000
##   1      0.47373684  0.3478031  0.0000000
##   1      0.52636316  0.3478031  0.0000000
##   1      0.57898947  0.3478031  0.0000000
##   1      0.63161579  0.3478031  0.0000000
##   1      0.68424211  0.3478031  0.0000000
##   1      0.73686842  0.3478031  0.0000000
##   1      0.78949474  0.3478031  0.0000000
##   1      0.84212105  0.3478031  0.0000000
##   1      0.89474737  0.3478031  0.0000000
##   1      0.94737368  0.3478031  0.0000000
##   1      1.00000000  0.3478031  0.0000000
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were alpha = 0 and lambda = 0.1579789.



Tune the best models

Promising models

# MOST PROMISING MODEL: compare the three strongest candidates on their
# seed-matched resampling distributions
results <- resamples(list(GBM=fit.gbm, SVM=modelSvm, rf=fit.rf))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: GBM, SVM, rf 
## Number of resamples: 30 
## 
## Accuracy 
##       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## GBM 0.6486  0.7047 0.7434 0.7393  0.7568 0.8611    0
## SVM 0.5135  0.6778 0.7297 0.7171  0.7568 0.8889    0
## rf  0.6216  0.6689 0.7297 0.7207  0.7568 0.8378    0
## 
## Kappa 
##       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## GBM 0.4703  0.5570 0.6134 0.6085  0.6356 0.7919    0
## SVM 0.2665  0.5172 0.5941 0.5747  0.6336 0.8333    0
## rf  0.4283  0.5007 0.5941 0.5806  0.6349 0.7568    0
# Dot plot of the three models' resampled accuracy/kappa
dotplot(results)

Feature selection

## + Fold01: mtry= 2 
## - Fold01: mtry= 2 
## + Fold01: mtry=10 
## - Fold01: mtry=10 
## + Fold01: mtry=18 
## - Fold01: mtry=18 
## + Fold02: mtry= 2 
## - Fold02: mtry= 2 
## + Fold02: mtry=10 
## - Fold02: mtry=10 
## + Fold02: mtry=18 
## - Fold02: mtry=18 
## + Fold03: mtry= 2 
## - Fold03: mtry= 2 
## + Fold03: mtry=10 
## - Fold03: mtry=10 
## + Fold03: mtry=18 
## - Fold03: mtry=18 
## + Fold04: mtry= 2 
## - Fold04: mtry= 2 
## + Fold04: mtry=10 
## - Fold04: mtry=10 
## + Fold04: mtry=18 
## - Fold04: mtry=18 
## + Fold05: mtry= 2 
## - Fold05: mtry= 2 
## + Fold05: mtry=10 
## - Fold05: mtry=10 
## + Fold05: mtry=18 
## - Fold05: mtry=18 
## + Fold06: mtry= 2 
## - Fold06: mtry= 2 
## + Fold06: mtry=10 
## - Fold06: mtry=10 
## + Fold06: mtry=18 
## - Fold06: mtry=18 
## + Fold07: mtry= 2 
## - Fold07: mtry= 2 
## + Fold07: mtry=10 
## - Fold07: mtry=10 
## + Fold07: mtry=18 
## - Fold07: mtry=18 
## + Fold08: mtry= 2 
## - Fold08: mtry= 2 
## + Fold08: mtry=10 
## - Fold08: mtry=10 
## + Fold08: mtry=18 
## - Fold08: mtry=18 
## + Fold09: mtry= 2 
## - Fold09: mtry= 2 
## + Fold09: mtry=10 
## - Fold09: mtry=10 
## + Fold09: mtry=18 
## - Fold09: mtry=18 
## + Fold10: mtry= 2 
## - Fold10: mtry= 2 
## + Fold10: mtry=10 
## - Fold10: mtry=10 
## + Fold10: mtry=18 
## - Fold10: mtry=18 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
#1. check the importance of each variables
# NOTE(review): rf_model and train_fe are not defined anywhere in the
# visible code at this point — judging by the mtry log above, rf_model was
# trained in a hidden chunk; rf_model is redefined in the "New RF" section
# below.  Confirm chunk ordering when re-knitting.
vimp <- varImp(rf_model)
# Plotting "vimp"
ggplot(vimp, top = dim(vimp$importance)[1])

# Correlation among raw and engineered features (to spot redundancy)
train_correlation <- train_fe %>% select(bone_length:has_soul,sep1:flesh_soul) 
train_correlation <- cor(train_correlation) 
# corrplot(train_correlation, method="circle")

# data 
corrplot.mixed(train_correlation) 

#cor(train_correlation)

New RF

# 2.TUNE (http://machinelearningmastery.com/tune-machine-learning-algorithms-in-r/)

set.seed(10)

# Random forest (ranger backend) on raw + pairwise-product features,
# tuning mtry over 3 values; 'impurity' importance is stored so varImp()
# works on the fitted model
rf_model <- train(
    type ~ bone_length + rotting_flesh + hair_length + has_soul + hair_soul + bone_flesh + bone_hair + 
        bone_soul + flesh_hair + flesh_soul,
    tuneLength = 3,
    data = train_fe, 
    method = "ranger", 
    trControl = myControl,
    importance = 'impurity'
)
## + Fold01: mtry= 2 
## - Fold01: mtry= 2 
## + Fold01: mtry= 6 
## - Fold01: mtry= 6 
## + Fold01: mtry=10 
## - Fold01: mtry=10 
## + Fold02: mtry= 2 
## - Fold02: mtry= 2 
## + Fold02: mtry= 6 
## - Fold02: mtry= 6 
## + Fold02: mtry=10 
## - Fold02: mtry=10 
## + Fold03: mtry= 2 
## - Fold03: mtry= 2 
## + Fold03: mtry= 6 
## - Fold03: mtry= 6 
## + Fold03: mtry=10 
## - Fold03: mtry=10 
## + Fold04: mtry= 2 
## - Fold04: mtry= 2 
## + Fold04: mtry= 6 
## - Fold04: mtry= 6 
## + Fold04: mtry=10 
## - Fold04: mtry=10 
## + Fold05: mtry= 2 
## - Fold05: mtry= 2 
## + Fold05: mtry= 6 
## - Fold05: mtry= 6 
## + Fold05: mtry=10 
## - Fold05: mtry=10 
## + Fold06: mtry= 2 
## - Fold06: mtry= 2 
## + Fold06: mtry= 6 
## - Fold06: mtry= 6 
## + Fold06: mtry=10 
## - Fold06: mtry=10 
## + Fold07: mtry= 2 
## - Fold07: mtry= 2 
## + Fold07: mtry= 6 
## - Fold07: mtry= 6 
## + Fold07: mtry=10 
## - Fold07: mtry=10 
## + Fold08: mtry= 2 
## - Fold08: mtry= 2 
## + Fold08: mtry= 6 
## - Fold08: mtry= 6 
## + Fold08: mtry=10 
## - Fold08: mtry=10 
## + Fold09: mtry= 2 
## - Fold09: mtry= 2 
## + Fold09: mtry= 6 
## - Fold09: mtry= 6 
## + Fold09: mtry=10 
## - Fold09: mtry=10 
## + Fold10: mtry= 2 
## - Fold10: mtry= 2 
## + Fold10: mtry= 6 
## - Fold10: mtry= 6 
## + Fold10: mtry=10 
## - Fold10: mtry=10 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 6 on full training set
print(rf_model)
## Random Forest 
## 
## 371 samples
##  10 predictor
##   3 classes: 'Ghost', 'Ghoul', 'Goblin' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 333, 333, 333, 333, 334, 334, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7167259  0.5746530
##    6    0.7195037  0.5790649
##   10    0.7192903  0.5785770
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 6.
plot(rf_model)

# Second ranger forest swapping in the alternative engineered features
# (allfeatures, sep2) and re-adding color
rf_model2 <- train(
type ~ bone_length + rotting_flesh + hair_length + has_soul + color + allfeatures + bone_flesh + sep2 + 
        bone_soul + flesh_hair + flesh_soul,
    tuneLength = 3,
    data = train_fe, 
    method = "ranger", 
    trControl = myControl,
    importance = 'impurity'
)
## + Fold01: mtry= 2 
## - Fold01: mtry= 2 
## + Fold01: mtry= 8 
## - Fold01: mtry= 8 
## + Fold01: mtry=15 
## - Fold01: mtry=15 
## + Fold02: mtry= 2 
## - Fold02: mtry= 2 
## + Fold02: mtry= 8 
## - Fold02: mtry= 8 
## + Fold02: mtry=15 
## - Fold02: mtry=15 
## + Fold03: mtry= 2 
## - Fold03: mtry= 2 
## + Fold03: mtry= 8 
## - Fold03: mtry= 8 
## + Fold03: mtry=15 
## - Fold03: mtry=15 
## + Fold04: mtry= 2 
## - Fold04: mtry= 2 
## + Fold04: mtry= 8 
## - Fold04: mtry= 8 
## + Fold04: mtry=15 
## - Fold04: mtry=15 
## + Fold05: mtry= 2 
## - Fold05: mtry= 2 
## + Fold05: mtry= 8 
## - Fold05: mtry= 8 
## + Fold05: mtry=15 
## - Fold05: mtry=15 
## + Fold06: mtry= 2 
## - Fold06: mtry= 2 
## + Fold06: mtry= 8 
## - Fold06: mtry= 8 
## + Fold06: mtry=15 
## - Fold06: mtry=15 
## + Fold07: mtry= 2 
## - Fold07: mtry= 2 
## + Fold07: mtry= 8 
## - Fold07: mtry= 8 
## + Fold07: mtry=15 
## - Fold07: mtry=15 
## + Fold08: mtry= 2 
## - Fold08: mtry= 2 
## + Fold08: mtry= 8 
## - Fold08: mtry= 8 
## + Fold08: mtry=15 
## - Fold08: mtry=15 
## + Fold09: mtry= 2 
## - Fold09: mtry= 2 
## + Fold09: mtry= 8 
## - Fold09: mtry= 8 
## + Fold09: mtry=15 
## - Fold09: mtry=15 
## + Fold10: mtry= 2 
## - Fold10: mtry= 2 
## + Fold10: mtry= 8 
## - Fold10: mtry= 8 
## + Fold10: mtry=15 
## - Fold10: mtry=15 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 8 on full training set
# BUG FIX: this section follows the rf_model2 fit, but the original
# code printed and plotted rf_model again (copy-paste error).
# NOTE(review): the echoed output below is stale -- it repeats
# rf_model's results (mtry candidates 2/6/10, mtry = 6 selected) even
# though the rf_model2 fit above reports "Fitting mtry = 8 on full
# training set"; re-knit to refresh it.
print(rf_model2)
## Random Forest 
## 
## 371 samples
##  10 predictor
##   3 classes: 'Ghost', 'Ghoul', 'Goblin' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 333, 333, 333, 333, 334, 334, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7167259  0.5746530
##    6    0.7195037  0.5790649
##   10    0.7192903  0.5785770
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 6.
# Plot CV accuracy against mtry for the feature-engineered forest.
plot(rf_model2)

New GBM

# 1. TUNE - refs: http://stackoverflow.com/questions/15613332/using-caret-package-to-find-optimal-parameters-of-gbm - http://stats.stackexchange.com/questions/141719/change-settings-in-the-prediction-model-caret-package 

# Fix the RNG state so the CV fold assignment is reproducible.
set.seed(10)

# Elastic net (glmnet) on the base features plus engineered
# interaction terms. The grid searches pure ridge (alpha = 0) and
# pure lasso (alpha = 1) across 20 lambda values in [1e-4, 1].
glm_model <- train(
    type ~ bone_length + rotting_flesh + hair_length + has_soul + color +
        hair_soul + bone_flesh + bone_hair + bone_soul + flesh_hair +
        flesh_soul,
    data = train_fe,
    method = "glmnet",
    trControl = myControl,
    tuneGrid = expand.grid(
        alpha = 0:1,
        lambda = seq(0.0001, 1, length = 20)
    )
)
## + Fold01: alpha=0, lambda=1 
## - Fold01: alpha=0, lambda=1 
## + Fold01: alpha=1, lambda=1 
## - Fold01: alpha=1, lambda=1 
## + Fold02: alpha=0, lambda=1 
## - Fold02: alpha=0, lambda=1 
## + Fold02: alpha=1, lambda=1 
## - Fold02: alpha=1, lambda=1 
## + Fold03: alpha=0, lambda=1 
## - Fold03: alpha=0, lambda=1 
## + Fold03: alpha=1, lambda=1 
## - Fold03: alpha=1, lambda=1 
## + Fold04: alpha=0, lambda=1 
## - Fold04: alpha=0, lambda=1 
## + Fold04: alpha=1, lambda=1 
## - Fold04: alpha=1, lambda=1 
## + Fold05: alpha=0, lambda=1 
## - Fold05: alpha=0, lambda=1 
## + Fold05: alpha=1, lambda=1 
## - Fold05: alpha=1, lambda=1 
## + Fold06: alpha=0, lambda=1 
## - Fold06: alpha=0, lambda=1 
## + Fold06: alpha=1, lambda=1 
## - Fold06: alpha=1, lambda=1 
## + Fold07: alpha=0, lambda=1 
## - Fold07: alpha=0, lambda=1 
## + Fold07: alpha=1, lambda=1 
## - Fold07: alpha=1, lambda=1 
## + Fold08: alpha=0, lambda=1 
## - Fold08: alpha=0, lambda=1 
## + Fold08: alpha=1, lambda=1 
## - Fold08: alpha=1, lambda=1 
## + Fold09: alpha=0, lambda=1 
## - Fold09: alpha=0, lambda=1 
## + Fold09: alpha=1, lambda=1 
## - Fold09: alpha=1, lambda=1 
## + Fold10: alpha=0, lambda=1 
## - Fold10: alpha=0, lambda=1 
## + Fold10: alpha=1, lambda=1 
## - Fold10: alpha=1, lambda=1 
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0, lambda = 0.316 on full training set
# Variant of glm_model with sep2 swapped in for bone_hair; same
# glmnet grid (ridge vs. lasso x 20 lambdas) and shared CV control.
glm_model2 <- train(
    type ~ bone_length + rotting_flesh + hair_length + has_soul + color +
        hair_soul + bone_flesh + sep2 + bone_soul + flesh_hair +
        flesh_soul,
    data = train_fe,
    method = "glmnet",
    trControl = myControl,
    tuneGrid = expand.grid(
        alpha = 0:1,
        lambda = seq(0.0001, 1, length = 20)
    )
)
## + Fold01: alpha=0, lambda=1 
## - Fold01: alpha=0, lambda=1 
## + Fold01: alpha=1, lambda=1 
## - Fold01: alpha=1, lambda=1 
## + Fold02: alpha=0, lambda=1 
## - Fold02: alpha=0, lambda=1 
## + Fold02: alpha=1, lambda=1 
## - Fold02: alpha=1, lambda=1 
## + Fold03: alpha=0, lambda=1 
## - Fold03: alpha=0, lambda=1 
## + Fold03: alpha=1, lambda=1 
## - Fold03: alpha=1, lambda=1 
## + Fold04: alpha=0, lambda=1 
## - Fold04: alpha=0, lambda=1 
## + Fold04: alpha=1, lambda=1 
## - Fold04: alpha=1, lambda=1 
## + Fold05: alpha=0, lambda=1 
## - Fold05: alpha=0, lambda=1 
## + Fold05: alpha=1, lambda=1 
## - Fold05: alpha=1, lambda=1 
## + Fold06: alpha=0, lambda=1 
## - Fold06: alpha=0, lambda=1 
## + Fold06: alpha=1, lambda=1 
## - Fold06: alpha=1, lambda=1 
## + Fold07: alpha=0, lambda=1 
## - Fold07: alpha=0, lambda=1 
## + Fold07: alpha=1, lambda=1 
## - Fold07: alpha=1, lambda=1 
## + Fold08: alpha=0, lambda=1 
## - Fold08: alpha=0, lambda=1 
## + Fold08: alpha=1, lambda=1 
## - Fold08: alpha=1, lambda=1 
## + Fold09: alpha=0, lambda=1 
## - Fold09: alpha=0, lambda=1 
## + Fold09: alpha=1, lambda=1 
## - Fold09: alpha=1, lambda=1 
## + Fold10: alpha=0, lambda=1 
## - Fold10: alpha=0, lambda=1 
## + Fold10: alpha=1, lambda=1 
## - Fold10: alpha=1, lambda=1 
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 1e-04 on full training set
# Third glmnet variant: replaces hair_soul with the combined
# allfeatures term; otherwise identical grid and CV control.
glm_model3 <- train(
    type ~ bone_length + rotting_flesh + hair_length + has_soul + color +
        allfeatures + bone_flesh + sep2 + bone_soul + flesh_hair +
        flesh_soul,
    data = train_fe,
    method = "glmnet",
    trControl = myControl,
    tuneGrid = expand.grid(
        alpha = 0:1,
        lambda = seq(0.0001, 1, length = 20)
    )
)
## + Fold01: alpha=0, lambda=1 
## - Fold01: alpha=0, lambda=1 
## + Fold01: alpha=1, lambda=1 
## - Fold01: alpha=1, lambda=1 
## + Fold02: alpha=0, lambda=1 
## - Fold02: alpha=0, lambda=1 
## + Fold02: alpha=1, lambda=1 
## - Fold02: alpha=1, lambda=1 
## + Fold03: alpha=0, lambda=1 
## - Fold03: alpha=0, lambda=1 
## + Fold03: alpha=1, lambda=1 
## - Fold03: alpha=1, lambda=1 
## + Fold04: alpha=0, lambda=1 
## - Fold04: alpha=0, lambda=1 
## + Fold04: alpha=1, lambda=1 
## - Fold04: alpha=1, lambda=1 
## + Fold05: alpha=0, lambda=1 
## - Fold05: alpha=0, lambda=1 
## + Fold05: alpha=1, lambda=1 
## - Fold05: alpha=1, lambda=1 
## + Fold06: alpha=0, lambda=1 
## - Fold06: alpha=0, lambda=1 
## + Fold06: alpha=1, lambda=1 
## - Fold06: alpha=1, lambda=1 
## + Fold07: alpha=0, lambda=1 
## - Fold07: alpha=0, lambda=1 
## + Fold07: alpha=1, lambda=1 
## - Fold07: alpha=1, lambda=1 
## + Fold08: alpha=0, lambda=1 
## - Fold08: alpha=0, lambda=1 
## + Fold08: alpha=1, lambda=1 
## - Fold08: alpha=1, lambda=1 
## + Fold09: alpha=0, lambda=1 
## - Fold09: alpha=0, lambda=1 
## + Fold09: alpha=1, lambda=1 
## - Fold09: alpha=1, lambda=1 
## + Fold10: alpha=0, lambda=1 
## - Fold10: alpha=0, lambda=1 
## + Fold10: alpha=1, lambda=1 
## - Fold10: alpha=1, lambda=1 
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.0527 on full training set
# Reset the seed so the GBM sees the same fold assignment.
set.seed(10)

# Stochastic gradient boosting on the five raw features only
# (no engineered terms). verbose = FALSE silences gbm's
# per-iteration progress output.
# NOTE(review): `metric` and `control` are presumably defined earlier
# in the file (the other models here use `myControl`) -- confirm.
fit.gbm <- train(
    type ~ bone_length + rotting_flesh + hair_length + has_soul + color,
    data = train_fe,
    method = "gbm",
    metric = metric,
    trControl = control,
    verbose = FALSE
)

Results

# Pool the 10-fold resampling distributions of all five fitted models
# so accuracy/kappa can be compared on the same folds.
# summarize results
results <- resamples(list(glm=glm_model, rf=rf_model, rf2=rf_model2, glm2 =glm_model2, glm3=glm_model3))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: glm, rf, rf2, glm2, glm3 
## Number of resamples: 10 
## 
## Accuracy 
##        Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## glm  0.6842  0.7222 0.7333 0.7438  0.7616 0.8649    0
## rf   0.6667  0.6888 0.7105 0.7195  0.7500 0.7895    0
## rf2  0.6216  0.6888 0.7260 0.7203  0.7518 0.8333    0
## glm2 0.6579  0.7095 0.7468 0.7473  0.7989 0.8286    0
## glm3 0.6316  0.7153 0.7838 0.7636  0.8015 0.8649    0
## 
## Kappa 
##        Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## glm  0.5265  0.5833 0.5988 0.6156  0.6428 0.7965    0
## rf   0.4977  0.5337 0.5664 0.5791  0.6247 0.6837    0
## rf2  0.4301  0.5339 0.5902 0.5806  0.6276 0.7494    0
## glm2 0.4870  0.5640 0.6195 0.6204  0.6975 0.7420    0
## glm3 0.4470  0.5743 0.6740 0.6452  0.7036 0.7967    0
# Dot plot of the resampled metrics per model; per the summary above,
# glm3 has the highest median and mean accuracy.
dotplot(results)

Send predictions to Kaggle:

# test <- read.csv("test.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
# 
# ## Make predictions
# ## Reorder the data by creature ID number
# test_complete <- full[full$Dataset == 'test', ]
# test_complete <- test_complete %>%
#                   arrange(id)
# 
# # Make predicted creature-type values
# my_prediction <- predict(glm_model, test_complete)
# solution <- data.frame(id = test_complete$id, Type = my_prediction)
# write.csv(solution, file = "solution.csv", row.names = FALSE)
# 
# # glm_model3
# my_prediction <- predict(glm_model3, test_complete)
# solution <- data.frame(id = test_complete$id, Type = my_prediction)
# write.csv(solution, file = "glm_model3.csv", row.names = FALSE)
# 
# 
# # # Bayes
# type <- predict(e1071model,test)
# bayes2 <- cbind(test, type)
# #write.csv(bayes2, file = "bayes.csv")
# 
# # Knn
# predictions<- predict(knnFit,test)
# results_knn <- cbind(test,predictions)
# #write.csv(results_knn, file = "knn.csv")
# 
# # modelGbm
# predictions<- predict(modelGbm,test)
# results_modelGbm <- cbind(test,predictions)
# write.csv(results_modelGbm, file = "gbm.csv")
#
# # Tree
# predictions <- predict(rpartmodel,test)
# train_cv_tree<- cbind(test,predictions)
# #write.csv(train_cv_tree, file = "tree.csv")
#
# ## Combination 
# type <- predict(rfmodel,test)
# RF <- cbind(test,type)
# RFprint <- RF %>% select(id, type)
# #write.csv(RFprint, file = "rf.csv")

# stackingmodel
# type <- predict(stackingmodel,test$type)
# RF <- cbind(stackingmodel,type)
# RFprint <- RF %>% select(id, type)



Archives

# # # principal component analysis
# # NOT available  
# # library(prcomp)
# prin_comp <- train %>% select(bone_length:has_soul)
# res.pca <- prcomp(prin_comp, scale = TRUE)
# 
# # Head
# head(unclass(res.pca$rotation)[, 1:4])
# 
# prin_comp <- prcomp(prin_comp, scale. = T)
# print(prin_comp)
# names(res.pca)
# 
# ## Variances of the principal components
# 
# # The variance retained by each principal component can be obtained as follow :
# 
# # Eigenvalues
# eig <- (res.pca$sdev)^2
# # Variances in percentage
# variance <- eig*100/sum(eig)
# # Cumulative variances
# cumvar <- cumsum(variance)
# train.pca <- data.frame(eig = eig, variance = variance,
#                      cumvariance = cumvar)
# head(train.pca)
# 
# # Or extract
# library("factoextra")
# eig.val <- get_eigenvalue(res.pca)
# head(eig.val)
# 
# # Variance 
# barplot(train.pca[, 2], names.arg=1:nrow(train.pca), 
#        main = "Variances",
#        xlab = "Principal Components",
#        ylab = "Percentage of variances",
#        col ="steelblue")
# 
# # Eigenvalue
# fviz_screeplot(res.pca, ncp=10, choice="eigenvalue")
# 
# fviz_pca_ind(res.pca, col.ind="cos2") +
# scale_color_gradient2(low="white", mid="blue", 
#     high="red", midpoint=0.50) + theme_minimal()
# 
# 
# fviz_pca_biplot(res.pca,  geom = "text") +
#   theme_minimal()